# Purpose: Produce some graphs that summarize how well our
# ID'ing of natives and migrants matches actual admin data

source("constants.R")

#####################################
## 1. Read in the admin / FB data ##
####################################

# See the README for instructions on how to access these data
kreis_demos <- read_csv("input/derived/population/kreis_total_pop_by_age_gender.csv")
lander_demos <- read_csv("input/derived/population/land_total_pop_by_age_gender.csv")

# See the README for instructions on how to access these data
kreis_xw <- read_csv("input/derived/nuts_xw/nuts3_DE_xw.csv")
lander_xw <- read_csv("input/derived/nuts_xw/nuts1_name_xw.csv")

kreis_to_exclude <- unique(filter(kreis_demos, year == 2019, is.na(frac_syr_tot))$ags)


##############################
## 2. Comparison by Kreis ##
#############################

# Get the share Syrian by group
kreis_demos_2019 <- full_join(

  # Syrian pops
  kreis_demos %>%
    filter(!is.na(pop_syr_m) & !is.na(pop_syr_f)) %>%
    filter(year == 2019) %>%
    select(ags, "pop_syr_m", "pop_syr_f") %>%
    gather(gender, n_sy, -ags) %>%
    mutate(
      gender = str_remove(gender, "pop_syr_")),

  # Total pops
  kreis_demos %>%
    filter(!is.na(pop_syr_m) & !is.na(pop_syr_f)) %>%
    filter(year == 2019) %>%
    select(ags, "pop_m_tot", "pop_f_tot") %>%
    gather(gender, n_total, -ags) %>%
    mutate(
      gender = str_remove(gender, "pop_"),
      gender = str_remove(gender, "_tot")),

  # Merge together
  by = c("ags", "gender")

)


#### First using the observe entry (loc) data only ####

# See the README for instructions on how to access this data
dat_in_kreis <- read_csv("input/pipeline/migration_fb_vs_true_kreis.csv")

kreis_combined_2019 <- dat_in_kreis %>%
  filter(tot_nuts3_sy_migrants >= 40) %>%
  left_join(kreis_xw, by=c("nuts3"="NUTS3")) %>%
  inner_join(kreis_demos_2019, by=c("ID"="ags", "gender"="gender")) %>%
  mutate(share_sy_true = n_sy/n_total)

# Compare measures (weighted by total pop)
wtd_cor <- wtd.cor(
  kreis_combined_2019$share_sy_true,
  kreis_combined_2019$share_sy_fb,
  kreis_combined_2019$n_total)[1]

ggplot(kreis_combined_2019, aes(x=share_sy_true, y=share_sy_fb, size=n_total)) +
  geom_point() +
  geom_abline(col="dark gray", linetype = "dashed", size=1.25) +
  geom_smooth(method='lm', se=F, mapping = aes(weight = n_total), show.legend = FALSE) +
  theme_classic() +
  labs(title = str_interp("Weighted Correlation = ${round(wtd_cor, 3)}\n(N = ${nrow(kreis_combined_2019)})"),
       size="County x \nGender Pop.",
       x = "Share SY Migrant, Admin Data",
       y = "Share SY Migrant, FB Data")

ggsave("output/fb_vs_true_kreis_loc.png", last_plot(), width=5, height=4)


#### Then do the same with all Syrian migrants ####

# See the README for instructions on how to access this data
dat_in_kreis.w_hometowns <- read_csv("input/pipeline/migration_fb_vs_true_kreis_all.csv")

kreis_combined_2019.w_hometowns <- dat_in_kreis.w_hometowns %>%
  filter(tot_nuts3_sy_migrants >= 40) %>%
  left_join(kreis_xw, by=c("nuts3"="NUTS3")) %>%
  inner_join(kreis_demos_2019, by=c("ID"="ags", "gender"="gender")) %>%
  mutate(share_sy_true = n_sy/n_total)

# Compare measures (weighted by total pop)
wtd_cor <- wtd.cor(
  kreis_combined_2019.w_hometowns$share_sy_true,
  kreis_combined_2019.w_hometowns$share_sy_fb,
  kreis_combined_2019.w_hometowns$n_total)[1]

ggplot(kreis_combined_2019.w_hometowns, aes(x=share_sy_true, y=share_sy_fb, size=n_total)) +
  geom_point() +
  geom_abline(col="dark gray", linetype = "dashed", size=1.25) +
  geom_smooth(method='lm', se=F, mapping = aes(weight = n_total), show.legend = FALSE) +
  theme_classic() +
  labs(title = str_interp("Weighted Correlation = ${round(wtd_cor, 3)}\n(N = ${nrow(kreis_combined_2019.w_hometowns)})"),
       size="County x \nGender Pop.",
       x = "Share SY Migrant, Admin Data",
       y = "Share SY Migrant, FB Data")

ggsave("output/fb_vs_true_kreis_all.png", last_plot(), width=5, height=4)

kreis_combined_2019.w_hometowns %>%
  mutate(gender = if_else(gender == "f", "Female", "Male")) %>%
  ggplot(aes(x=share_sy_true, y=share_sy_fb, size=n_total)) +
  geom_point(aes(col=gender), alpha=0.6) +
  scale_color_manual(values=c25) +
  geom_smooth(method='lm', se=F, mapping = aes(weight = n_total), show.legend = FALSE, col="gray") +
  theme_classic() +
  scale_size(guide="none") + # hide the size legend
  labs(title = str_interp("Weighted Correlation = ${round(wtd_cor, 3)}\n(N = ${nrow(kreis_combined_2019.w_hometowns)})"),
       size="County x \nGender Pop.",
       x = "Share SY Migrant, Admin Data",
       y = "Share SY Migrant, FB Data",
       col = "Gender")

ggsave("output/fb_vs_true_kreis_all_GENDER.png", last_plot(), width=5, height=4.5)



#############################
## 3. Comparison by Lander ##
#############################

# Get the share Syrian by group
lander_demos_2019 <- full_join(

  # Syrian pops
  lander_demos %>%
    filter(year == 2019) %>%
    select(state, starts_with("pop_syr_m_gen_1_"), starts_with("pop_syr_f_gen_1_")) %>%
    gather(grp, n_sy, -state) %>%
    mutate(
      grp = str_remove(grp, "pop_syr_"),
      grp = str_remove(grp, "_gen_1")) %>%
    separate(grp, c("gender","age_bkt"), sep = "\\_", extra = "merge"),

  # Total pops
  lander_demos %>%
    filter(year == 2019) %>%
    select(state, starts_with("pop_m_"), starts_with("pop_f_")) %>%
    gather(grp, n_total, -state) %>%
    mutate(
      grp = str_remove(grp, "pop_")) %>%
    separate(grp, c("gender","age_bkt"), sep = "\\_", extra = "merge"),

  # Merge together
  by = c("state", "gender", "age_bkt")

) %>%
  #Combine a few buckets
  mutate(
    age_bkt = if_else(age_bkt %in% c("18_19", "20_24"), "18_24", age_bkt),
    age_bkt = if_else(age_bkt %in% c("65_74", "74_plus"), "65_plus", age_bkt)) %>%
  group_by(state, gender, age_bkt) %>%
  summarise(
    n_sy = sum(n_sy),
    n_total = sum(n_total))


# Get the master data set we will use to make comparisons
# See the README for instructions on how to access this data
dat_in_lander <- read_csv("input/pipeline/migration_fb_vs_true_lander.csv")

lander_combined_2019 <- dat_in_lander %>%
  left_join(lander_xw, by=c("nuts1"="NUTS1")) %>%
  left_join(lander_demos_2019, by=c("Name"="state", "gender"="gender", "age_bkt"="age_bkt")) %>%
  mutate(share_sy_true = n_sy/n_total)

if(nrow(filter(lander_combined_2019, is.na(n_sy))) > 0) stop("Fix non-matching rows")

# Compare natives (weighted by total pop)
wtd_cor <- wtd.cor(
  lander_combined_2019$share_sy_true,
  lander_combined_2019$share_sy_fb,
  lander_combined_2019$n_total)[1]

ggplot(lander_combined_2019, aes(x=share_sy_true, y=share_sy_fb, size=n_total)) +
  geom_point() +
  geom_smooth(method='lm', se=F, mapping = aes(weight = n_total), show.legend = FALSE) +
  theme_classic() +
  labs(title = str_interp("Weighted Correlation = ${round(wtd_cor, 3)}\n(N = ${nrow(lander_combined_2019)})"),
       size="State x \nAge x \nGender Pop.",
       x = "Share SY Migrant, Admin Data",
       y = "Share SY Migrant, FB Data")

ggsave("output/fb_vs_true_lander_loc.png", last_plot(), width=5, height=4)


# Again, look at all data (using hometowns)
# See the README for instructions on how to access this data
dat_in_lander.w_hometowns <- read_csv("input/pipeline/migration_fb_vs_true_lander_all.csv")

lander_combined_2019.w_hometowns <- dat_in_lander.w_hometowns %>%
  left_join(lander_xw, by=c("nuts1"="NUTS1")) %>%
  left_join(lander_demos_2019, by=c("Name"="state", "gender"="gender", "age_bkt"="age_bkt")) %>%
  mutate(share_sy_true = n_sy/n_total)

if(nrow(filter(lander_combined_2019.w_hometowns, is.na(n_sy))) > 0) stop("Fix non-matching rows")

# Compare natives (weighted by total pop)
wtd_cor <- wtd.cor(
  lander_combined_2019.w_hometowns$share_sy_true,
  lander_combined_2019.w_hometowns$share_sy_fb,
  lander_combined_2019.w_hometowns$n_total)[1]

ggplot(lander_combined_2019.w_hometowns, aes(x=share_sy_true, y=share_sy_fb, size=n_total)) +
  geom_point() +
  geom_abline(col="dark gray", linetype = "dashed", size=1.25) +
  geom_smooth(method='lm', se=F, mapping = aes(weight = n_total), show.legend = FALSE) +
  theme_classic() +
  labs(title = str_interp("Weighted Correlation = ${round(wtd_cor, 3)}\n(N = ${nrow(lander_combined_2019.w_hometowns)})"),
       size="State x \nAge x \nGender Pop.",
       x = "Share SY Migrant, Admin Data",
       y = "Share SY Migrant, FB Data")

ggsave("output/fb_vs_true_lander_all.png", last_plot(), width=5, height=4)


lander_combined_2019.w_hometowns %>%
  mutate(gender = if_else(gender == "f", "Female", "Male")) %>%
  ggplot(aes(x=share_sy_true, y=share_sy_fb, size=n_total)) +
  geom_point(aes(col=gender), alpha=0.6) +
  scale_color_manual(values=c25) +
  geom_smooth(method='lm', se=F, mapping = aes(weight = n_total), show.legend = FALSE, col="gray") +
  theme_classic() +
  scale_size(guide="none") + # hide the size legend
  labs(title = str_interp("Weighted Correlation = ${round(wtd_cor, 3)}\n(N = ${nrow(lander_combined_2019)})"),
       size="State x \nAge x \nGender Pop.",
       x = "Share SY Migrant, Admin Data",
       y = "Share SY Migrant, FB Data",
       col = "Gender")

ggsave("output/fb_vs_true_lander_all_GENDER.png", last_plot(), width=5, height=4.5)


lander_combined_2019.w_hometowns %>%
  mutate(
    age_bkt = str_replace(age_bkt, "_", "-"),
    age_bkt = str_replace(age_bkt, "-plus", "+")) %>%
  ggplot(aes(x=share_sy_true, y=share_sy_fb, size=n_total)) +
  geom_point(aes(col=age_bkt), alpha=0.6) +
  scale_color_manual(values=c25) +
  geom_smooth(method='lm', se=F, mapping = aes(weight = n_total), show.legend = FALSE, col="gray") +
  theme_classic() +
  scale_size(guide="none") + # hide the size legend
  labs(title = str_interp("Weighted Correlation = ${round(wtd_cor, 3)}\n(N = ${nrow(lander_combined_2019)})"),
       size="State x \nAge x \nGender Pop.",
       x = "Share SY Migrant, Admin Data",
       y = "Share SY Migrant, FB Data",
       col = "Age")

ggsave("output/fb_vs_true_lander_all_AGE.png", last_plot(), width=5, height=4.5)

lander_combined_2019.w_hometowns %>%
  mutate(
    Name = str_replace(Name, "-", "- \n")) %>%
  ggplot(aes(x=share_sy_true, y=share_sy_fb, size=n_total)) +
  geom_point(aes(col=Name), alpha=0.6) +
  scale_color_manual(values=c(c25[c(1:6)], c25[c(8:25)])) +
  geom_smooth(method='lm', se=F, mapping = aes(weight = n_total), show.legend = FALSE, col="gray") +
  theme_classic() +
  scale_size(guide="none") + # hide the size legend
  labs(title = str_interp("Weighted Correlation = ${round(wtd_cor, 3)}\n(N = ${nrow(lander_combined_2019)})"),
       size="State x \nAge x \nGender Pop.",
       x = "Share SY Migrant, Admin Data",
       y = "Share SY Migrant, FB Data",
       col = "State")

ggsave("output/fb_vs_true_lander_all_LANDER.png", last_plot(), width=5, height=4.5)



###############################################
## 4. Comparison Overall -- longitudinal/age ##
###############################################

#### Overall longitudional ####
overall_longitudional <- lander_demos %>%
  select(year, starts_with("pop_syr_m_gen_1_"), starts_with("pop_syr_f_gen_1_")) %>%
  group_by(year) %>%
  summarise_all(sum) %>%
  gather(grp, n_sy, -year) %>%
  mutate(
    grp = str_remove(grp, "pop_syr_"),
    grp = str_remove(grp, "_gen_1")) %>%
  separate(grp, c("gender","age_bkt"), sep = "\\_", extra = "merge") %>%
  mutate(
    age_bkt = if_else(age_bkt %in% c("18_19", "20_24"), "18_24", age_bkt),
    age_bkt = if_else(age_bkt %in% c("65_74", "74_plus"), "65_plus", age_bkt)) %>%
  group_by(year, gender, age_bkt) %>%
  summarise(n_sy = sum(n_sy))


# Get the master data set we will use to make comparisons
# See the README for instructions on how to access this data
user_dat_in <- read_csv("input/pipeline/migration_fb_vs_true_user_dat.csv") 

overall_combined_longitudional <- user_dat_in %>%
  inner_join(overall_longitudional, by=c("year"="year", "gender"="gender", "age_bkt"))

# Compare
cor_to_use <- cor(
  overall_combined_longitudional$n_sy,
  overall_combined_longitudional$n_fb_sy)

ggplot(overall_combined_longitudional, aes(x=log(n_sy), y=log(n_fb_sy))) +
  geom_point(aes(col=as.factor(year)), size=4, alpha=0.75) +
  scale_color_manual(values=c25) +
  geom_smooth(method='lm', se=F, show.legend = FALSE, col="gray") +
  theme_classic() +
  labs(title = str_interp("Correlation = ${round(cor_to_use, 3)}\n(N = ${nrow(overall_combined_longitudional)})"),
       col = "Year",
       x = "log(SY Pop., Admin Data)",
       y = "log(N SY Migrants, FB Data)")

ggsave("output/fb_vs_true_longitudinal_YEAR.png", last_plot(), width=5, height=4)

overall_combined_longitudional %>%
  mutate(
    age_bkt = str_replace(age_bkt, "_", "-"),
    age_bkt = str_replace(age_bkt, "-plus", "+")) %>%
  ggplot(aes(x=log(n_sy), y=log(n_fb_sy))) +
  geom_point(aes(col=age_bkt), size=4, alpha=0.75) +
  scale_color_manual(values=c25) +
  geom_smooth(method='lm', se=F, show.legend = FALSE, col="gray") +
  theme_classic() +
  labs(title = str_interp("Correlation = ${round(cor_to_use, 3)}\n(N = ${nrow(overall_combined_longitudional)})"),
       col = "Age",
       x = "log(SY Pop., Admin Data)",
       y = "log(N SY Migrants, FB Data)")

ggsave("output/fb_vs_true_longitudinal_AGE.png", last_plot(), width=5, height=4)

overall_combined_longitudional %>%
  mutate(gender = if_else(gender == "f", "Female", "Male")) %>%
  ggplot(aes(x=log(n_sy), y=log(n_fb_sy))) +
  geom_point(aes(col=gender), size=4, alpha=0.75) +
  scale_color_manual(values=c25) +
  geom_smooth(method='lm', se=F, show.legend = FALSE, col="gray") +
  theme_classic() +
  labs(title = str_interp("Correlation = ${round(cor_to_use, 3)}\n(N = ${nrow(overall_combined_longitudional)})"),
       col = "Gender",
       x = "log(SY Pop., Admin Data)",
       y = "log(N SY Migrants, FB Data)")

ggsave("output/fb_vs_true_longitudinal_GENDER.png", last_plot(), width=5, height=4)




###########################
## 5. Native by Kreis ##
###########################

kreis_demos_2019.native <- full_join(

  # Syrian pops
  kreis_demos %>%
    filter(year == 2019) %>%
    mutate(
      share_native_m = 1-frac_for_m,
      share_native_f = 1-frac_for_f) %>%
    select(ags, share_native_m, share_native_f) %>%
    gather(gender, share_native_admin, -ags) %>%
    mutate(
      gender = str_remove(gender, "share_native_")) %>%
    left_join(kreis_xw, by=c("ags"="ID")) %>%
    filter(!ags %in% kreis_to_exclude),

  # Total pops
  kreis_demos %>%
    filter(!is.na(pop_syr_m) & !is.na(pop_syr_f)) %>%
    filter(year == 2019) %>%
    select(ags, "pop_m_tot", "pop_f_tot") %>%
    gather(gender, n_total, -ags) %>%
    mutate(
      gender = str_remove(gender, "pop_"),
      gender = str_remove(gender, "_tot")),

  # Merge together
  by = c("ags", "gender")
)

# See the README for instructions on how to access this data
dat_in_kreis_all.native <- read_csv("input/pipeline/migration_fb_vs_true_kreis_native.csv")

# Combine the data
kreis_combined_all.native <- dat_in_kreis_all.native %>%
  left_join(kreis_demos_2019.native, by=c("nuts3"="NUTS3"))

# Compare natives (weighted by total pop)
wtd_cor <- wtd.cor(
  kreis_combined_all.native$share_native_admin,
  kreis_combined_all.native$share_native_fb,
  kreis_combined_all.native$n_total)[1]

ggplot(kreis_combined_all.native, aes(x=share_native_admin, y=share_native_fb, size=n_total)) +
  geom_point() +
  geom_smooth(method='lm', se=F, mapping = aes(weight = n_total), show.legend = FALSE) +
  theme_classic() +
  # geom_abline(slope = 1, linetype = "dashed") +
  labs(title = str_interp("Weighted Correlation = ${round(wtd_cor, 3)}\n(N = ${sum(complete.cases(kreis_combined_all.native))})"),
       size="County x\nGender Pop.",
       x = "Share DE Native, Admin Data",
       y = "Share DE Native, FB Data")

ggsave("output/fb_vs_true_kreis_all_NATIVE.png", last_plot(), width=5, height=4)
